We already know how to extract useful information from data frames. Various statistics tell us a lot about the data. Nevertheless, values of mean, quantiles and standard deviations are inconvenient for understanding the whole picture.
We get the most information through our eyes, therefore the skill of presenting the data visually is one of the most powerful. By creating simple visualisations, you can make initial hypotheses and understand possible relationships between variables.
# install.packages('ggplot2')
library(ggplot2)
library(dplyr)
library(tidyr)
Today we will work with the iris dataset. It is already included in R by default.
This famous (Fisher’s or Anderson’s) iris data set gives the measurements in centimeters of the variables sepal length and width and petal length and width, respectively, for 50 flowers from each of 3 species of iris. The species are Iris setosa, versicolor, and virginica
# data(iris)
# Preview the first six rows of the built-in dataset.
head(iris, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Work on a copy so the built-in `iris` data frame itself is not modified.
iris_df <- iris
# Bin sepal length into two right-closed intervals:
# (4, 6] is labelled "short" and (6, 8] is labelled "long".
iris_df[["Sepal.Length.Category"]] <- cut(
  x = iris_df[["Sepal.Length"]],
  breaks = c(4, 6, 8),
  labels = c("short", "long")
)
# Print the full data frame, including the new column.
iris_df
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## 11 5.4 3.7 1.5 0.2 setosa
## 12 4.8 3.4 1.6 0.2 setosa
## 13 4.8 3.0 1.4 0.1 setosa
## 14 4.3 3.0 1.1 0.1 setosa
## 15 5.8 4.0 1.2 0.2 setosa
## 16 5.7 4.4 1.5 0.4 setosa
## 17 5.4 3.9 1.3 0.4 setosa
## 18 5.1 3.5 1.4 0.3 setosa
## 19 5.7 3.8 1.7 0.3 setosa
## 20 5.1 3.8 1.5 0.3 setosa
## 21 5.4 3.4 1.7 0.2 setosa
## 22 5.1 3.7 1.5 0.4 setosa
## 23 4.6 3.6 1.0 0.2 setosa
## 24 5.1 3.3 1.7 0.5 setosa
## 25 4.8 3.4 1.9 0.2 setosa
## 26 5.0 3.0 1.6 0.2 setosa
## 27 5.0 3.4 1.6 0.4 setosa
## 28 5.2 3.5 1.5 0.2 setosa
## 29 5.2 3.4 1.4 0.2 setosa
## 30 4.7 3.2 1.6 0.2 setosa
## 31 4.8 3.1 1.6 0.2 setosa
## 32 5.4 3.4 1.5 0.4 setosa
## 33 5.2 4.1 1.5 0.1 setosa
## 34 5.5 4.2 1.4 0.2 setosa
## 35 4.9 3.1 1.5 0.2 setosa
## 36 5.0 3.2 1.2 0.2 setosa
## 37 5.5 3.5 1.3 0.2 setosa
## 38 4.9 3.6 1.4 0.1 setosa
## 39 4.4 3.0 1.3 0.2 setosa
## 40 5.1 3.4 1.5 0.2 setosa
## 41 5.0 3.5 1.3 0.3 setosa
## 42 4.5 2.3 1.3 0.3 setosa
## 43 4.4 3.2 1.3 0.2 setosa
## 44 5.0 3.5 1.6 0.6 setosa
## 45 5.1 3.8 1.9 0.4 setosa
## 46 4.8 3.0 1.4 0.3 setosa
## 47 5.1 3.8 1.6 0.2 setosa
## 48 4.6 3.2 1.4 0.2 setosa
## 49 5.3 3.7 1.5 0.2 setosa
## 50 5.0 3.3 1.4 0.2 setosa
## 51 7.0 3.2 4.7 1.4 versicolor
## 52 6.4 3.2 4.5 1.5 versicolor
## 53 6.9 3.1 4.9 1.5 versicolor
## 54 5.5 2.3 4.0 1.3 versicolor
## 55 6.5 2.8 4.6 1.5 versicolor
## 56 5.7 2.8 4.5 1.3 versicolor
## 57 6.3 3.3 4.7 1.6 versicolor
## 58 4.9 2.4 3.3 1.0 versicolor
## 59 6.6 2.9 4.6 1.3 versicolor
## 60 5.2 2.7 3.9 1.4 versicolor
## 61 5.0 2.0 3.5 1.0 versicolor
## 62 5.9 3.0 4.2 1.5 versicolor
## 63 6.0 2.2 4.0 1.0 versicolor
## 64 6.1 2.9 4.7 1.4 versicolor
## 65 5.6 2.9 3.6 1.3 versicolor
## 66 6.7 3.1 4.4 1.4 versicolor
## 67 5.6 3.0 4.5 1.5 versicolor
## 68 5.8 2.7 4.1 1.0 versicolor
## 69 6.2 2.2 4.5 1.5 versicolor
## 70 5.6 2.5 3.9 1.1 versicolor
## 71 5.9 3.2 4.8 1.8 versicolor
## 72 6.1 2.8 4.0 1.3 versicolor
## 73 6.3 2.5 4.9 1.5 versicolor
## 74 6.1 2.8 4.7 1.2 versicolor
## 75 6.4 2.9 4.3 1.3 versicolor
## 76 6.6 3.0 4.4 1.4 versicolor
## 77 6.8 2.8 4.8 1.4 versicolor
## 78 6.7 3.0 5.0 1.7 versicolor
## 79 6.0 2.9 4.5 1.5 versicolor
## 80 5.7 2.6 3.5 1.0 versicolor
## 81 5.5 2.4 3.8 1.1 versicolor
## 82 5.5 2.4 3.7 1.0 versicolor
## 83 5.8 2.7 3.9 1.2 versicolor
## 84 6.0 2.7 5.1 1.6 versicolor
## 85 5.4 3.0 4.5 1.5 versicolor
## 86 6.0 3.4 4.5 1.6 versicolor
## 87 6.7 3.1 4.7 1.5 versicolor
## 88 6.3 2.3 4.4 1.3 versicolor
## 89 5.6 3.0 4.1 1.3 versicolor
## 90 5.5 2.5 4.0 1.3 versicolor
## 91 5.5 2.6 4.4 1.2 versicolor
## 92 6.1 3.0 4.6 1.4 versicolor
## 93 5.8 2.6 4.0 1.2 versicolor
## 94 5.0 2.3 3.3 1.0 versicolor
## 95 5.6 2.7 4.2 1.3 versicolor
## 96 5.7 3.0 4.2 1.2 versicolor
## 97 5.7 2.9 4.2 1.3 versicolor
## 98 6.2 2.9 4.3 1.3 versicolor
## 99 5.1 2.5 3.0 1.1 versicolor
## 100 5.7 2.8 4.1 1.3 versicolor
## 101 6.3 3.3 6.0 2.5 virginica
## 102 5.8 2.7 5.1 1.9 virginica
## 103 7.1 3.0 5.9 2.1 virginica
## 104 6.3 2.9 5.6 1.8 virginica
## 105 6.5 3.0 5.8 2.2 virginica
## 106 7.6 3.0 6.6 2.1 virginica
## 107 4.9 2.5 4.5 1.7 virginica
## 108 7.3 2.9 6.3 1.8 virginica
## 109 6.7 2.5 5.8 1.8 virginica
## 110 7.2 3.6 6.1 2.5 virginica
## 111 6.5 3.2 5.1 2.0 virginica
## 112 6.4 2.7 5.3 1.9 virginica
## 113 6.8 3.0 5.5 2.1 virginica
## 114 5.7 2.5 5.0 2.0 virginica
## 115 5.8 2.8 5.1 2.4 virginica
## 116 6.4 3.2 5.3 2.3 virginica
## 117 6.5 3.0 5.5 1.8 virginica
## 118 7.7 3.8 6.7 2.2 virginica
## 119 7.7 2.6 6.9 2.3 virginica
## 120 6.0 2.2 5.0 1.5 virginica
## 121 6.9 3.2 5.7 2.3 virginica
## 122 5.6 2.8 4.9 2.0 virginica
## 123 7.7 2.8 6.7 2.0 virginica
## 124 6.3 2.7 4.9 1.8 virginica
## 125 6.7 3.3 5.7 2.1 virginica
## 126 7.2 3.2 6.0 1.8 virginica
## 127 6.2 2.8 4.8 1.8 virginica
## 128 6.1 3.0 4.9 1.8 virginica
## 129 6.4 2.8 5.6 2.1 virginica
## 130 7.2 3.0 5.8 1.6 virginica
## 131 7.4 2.8 6.1 1.9 virginica
## 132 7.9 3.8 6.4 2.0 virginica
## 133 6.4 2.8 5.6 2.2 virginica
## 134 6.3 2.8 5.1 1.5 virginica
## 135 6.1 2.6 5.6 1.4 virginica
## 136 7.7 3.0 6.1 2.3 virginica
## 137 6.3 3.4 5.6 2.4 virginica
## 138 6.4 3.1 5.5 1.8 virginica
## 139 6.0 3.0 4.8 1.8 virginica
## 140 6.9 3.1 5.4 2.1 virginica
## 141 6.7 3.1 5.6 2.4 virginica
## 142 6.9 3.1 5.1 2.3 virginica
## 143 5.8 2.7 5.1 1.9 virginica
## 144 6.8 3.2 5.9 2.3 virginica
## 145 6.7 3.3 5.7 2.5 virginica
## 146 6.7 3.0 5.2 2.3 virginica
## 147 6.3 2.5 5.0 1.9 virginica
## 148 6.5 3.0 5.2 2.0 virginica
## 149 6.2 3.4 5.4 2.3 virginica
## 150 5.9 3.0 5.1 1.8 virginica
## Sepal.Length.Category
## 1 short
## 2 short
## 3 short
## 4 short
## 5 short
## 6 short
## 7 short
## 8 short
## 9 short
## 10 short
## 11 short
## 12 short
## 13 short
## 14 short
## 15 short
## 16 short
## 17 short
## 18 short
## 19 short
## 20 short
## 21 short
## 22 short
## 23 short
## 24 short
## 25 short
## 26 short
## 27 short
## 28 short
## 29 short
## 30 short
## 31 short
## 32 short
## 33 short
## 34 short
## 35 short
## 36 short
## 37 short
## 38 short
## 39 short
## 40 short
## 41 short
## 42 short
## 43 short
## 44 short
## 45 short
## 46 short
## 47 short
## 48 short
## 49 short
## 50 short
## 51 long
## 52 long
## 53 long
## 54 short
## 55 long
## 56 short
## 57 long
## 58 short
## 59 long
## 60 short
## 61 short
## 62 short
## 63 short
## 64 long
## 65 short
## 66 long
## 67 short
## 68 short
## 69 long
## 70 short
## 71 short
## 72 long
## 73 long
## 74 long
## 75 long
## 76 long
## 77 long
## 78 long
## 79 short
## 80 short
## 81 short
## 82 short
## 83 short
## 84 short
## 85 short
## 86 short
## 87 long
## 88 long
## 89 short
## 90 short
## 91 short
## 92 long
## 93 short
## 94 short
## 95 short
## 96 short
## 97 short
## 98 long
## 99 short
## 100 short
## 101 long
## 102 short
## 103 long
## 104 long
## 105 long
## 106 long
## 107 short
## 108 long
## 109 long
## 110 long
## 111 long
## 112 long
## 113 long
## 114 short
## 115 short
## 116 long
## 117 long
## 118 long
## 119 long
## 120 short
## 121 long
## 122 short
## 123 long
## 124 long
## 125 long
## 126 long
## 127 long
## 128 long
## 129 long
## 130 long
## 131 long
## 132 long
## 133 long
## 134 long
## 135 long
## 136 long
## 137 long
## 138 long
## 139 short
## 140 long
## 141 long
## 142 long
## 143 short
## 144 long
## 145 long
## 146 long
## 147 long
## 148 long
## 149 long
## 150 short
The simplest graph is a set of points, each having x and y coordinates.
# Ten x positions and, for each, the square of the matching even number
# 2, 4, ..., 20.
x <- 1:10
y <- seq(from = 2, to = 20, by = 2)^2
x
## [1] 1 2 3 4 5 6 7 8 9 10
y
## [1] 4 16 36 64 100 144 196 256 324 400
We consider that the x and y coordinates at the same positions in each of the vectors correspond to one particular point. So we have the points (1,4), (2,16), (3,36), etc.
seq(from_value, to_value, by_value) - returns a vector with values from from_value to to_value with a step by_value
plot(x, y)
# Scatter plot of sepal width against sepal length, one dot per row.
plot(x = iris_df$Sepal.Length, y = iris_df$Sepal.Width)
We can make this graph prettier
# Same scatter plot with a title, axis labels and styled points.
# The trailing comma after the last argument was removed: it passed an empty
# argument to plot().
plot(
  iris_df$Sepal.Length, iris_df$Sepal.Width,
  main = "Sepal.Length vs. Sepal.Width", # the title
  xlab = "Sepal.Length",                 # label of the x-axis
  ylab = "Sepal.Width",                  # label of the y-axis
  col = "blue",                          # colour of the points
  pch = 19,                              # point symbol: 19 is a solid circle
  cex = 1                                # size of the points
)
By default plot() shows scatter plot, but we can change
this behavior
# The `type` argument switches plot() away from the default scatter plot.
plot(x, y, type = "l")          # lines only
plot(x, y, type = "b", lwd = 3) # both points and lines; lwd = line width
plot(x, y, type = "o")          # points overplotted on the lines
Line graphs should sometimes be avoided, because lines connect points in the order of their position in the vectors
# The same kind of data, but with the x values deliberately out of order.
x <- c(10, 5, 9, 6, 8, 7, 2, 1, 4, 3)
y <- c(1:4, 6:8, 5, 10, 9)
# The connecting line follows the order of the vectors, not the order of x.
plot(x, y, type = "b", pch = 19)
After sorting:
# Keep each (x, y) pair together in a data frame and order the rows by x,
# so the line is drawn from left to right.
df <- arrange(data.frame(x, y), x)
plot(df$x, df$y, type = "b", pch = 19)
# Advanced: the same result with base R only, reordering both vectors by the
# ordering of x.
# indexes_to_sort = order(x)
# x_sorted = x[indexes_to_sort]
# y_sorted = y[indexes_to_sort]
# plot(x_sorted, y_sorted, type = "b", pch=19)
NB! We can’t just sort one of the vectors, because we must preserve the correspondence between the x and y coordinates in the two vectors.
# Named numeric vector: one height per person.
heights <- c(Roman = 190, Ann = 172, Charlie = 121)
# One bar per element of the vector.
barplot(heights)
We can make this graph prettier
# Horizontal bar chart of the heights with a title, labels and colours.
# Because the bars are horizontal, the values run along the x-axis.
barplot(
  heights,
  main = "Heights of people", # the title
  xlab = "Height",            # label of the x-axis
  ylab = "Name",              # label of the y-axis
  col = "lightblue",          # fill colour of the bars
  border = "blue",            # colour of the bar borders
  horiz = TRUE,               # draw the bars horizontally
  xlim = c(0, 200)            # range of values shown on the x-axis
)
# Count the rows per species, then draw one bar per species.
number_of_species <- table(iris_df$Species)
barplot(number_of_species, col = "violet")
Now we see that our data “is balanced”
# Histogram of sepal length; a histogram takes a single variable (x-axis only).
hist(
  iris_df$Sepal.Length,
  main = "Sepal.Length distribution",
  xlab = "Sepal.length",
  col = "lightgreen",
  # Requested number of ranges the x-axis is split into; a frequency is
  # counted for each range. hist() treats a single number as a suggestion and
  # picks "pretty" break points.
  breaks = 8
)
Each bar represents the number of irises whose Sepal.Length falls into that
particular range.
For example, the first bar has a height of 5, which means that there are 5
flowers with Sepal.Length between 4 and 4.5.
We can make bars two times thinner
# Same histogram with twice as many requested ranges, so the bars get thinner.
hist(
  iris_df$Sepal.Length,
  main = "Sepal.Length distribution",
  xlab = "Sepal.length",
  col = "lightgreen",
  breaks = 16 # <---- changed
)
Boxplots are very informative charts. They describe the same distribution as a histogram, but additionally mark the median, the quartiles and the outliers explicitly.
# Single horizontal boxplot of sepal length over all flowers.
# TRUE/FALSE are used instead of T/F, which are ordinary variables that can be
# reassigned.
boxplot(iris_df$Sepal.Length,
        # main = "Sepal.Length",
        ylab = "",
        xlab = "Sepal.Length",
        col = "darkviolet",
        horizontal = TRUE)
# One vertical boxplot of sepal length per species (formula: values ~ groups).
boxplot(iris_df$Sepal.Length ~ iris_df$Species,
        main = "Sepal.Length",
        xlab = "Species",
        ylab = "Sepal.Length",
        col = "darkviolet",
        horizontal = FALSE)
A boxplot works well for unimodal, roughly normal distributions, because it cannot show that a distribution has two peaks.
ggplot2 is the most popular package for creating charts.
The ggplot is based on 3 things: data, aesthetics and geoms (geometries)
Inside the geoms there are aesthetics.
Inside the aesthetics we put the variables from the data that we want to see in the plot. These will be our axes.
An axis is not just an x and y coordinate - any aesthetic, such as fill, color, size etc., can also be a (pseudo)axis.
Each component in the graphic is added layer by layer
# Minimal ggplot: data, aesthetics and one geom, added layer by layer.
ggplot(
  data = iris_df,                                  # data
  mapping = aes(x = Sepal.Length, y = Sepal.Width) # aesthetics (axes)
) +
  geom_point()                                     # geom
Obligatory components to create a chart:
ggplot(data=iris_df) - data: data.frame, tibble, …
ggplot(..., mapping=aes(x=Sepal.Length, y=Sepal.Width)) - aesthetics, which are turned into the x and y axes.
+ geom_point() - geom, at least one.
ggplot(data=iris_df,
mapping = aes(x=Sepal.Length, y=Sepal.Width,
color=Species, size = Petal.Width,
shape=Sepal.Length.Category)) +
geom_point()
shape - shape of points
fill - filling color
stroke - stroke thickness
alpha - transparency
Geometries define the types of graphs in the diagram
# Variant 1: the aesthetic is declared in ggplot().
iris_df %>%
  ggplot(aes(x = Sepal.Width)) + # <--- here
  geom_histogram(bins = 20, fill = "lightblue", col = "black")
# Variant 2: the same aesthetic is declared in the geom instead.
iris_df %>%
  ggplot() +
  geom_histogram(
    aes(x = Sepal.Width), # <--- here
    bins = 20, fill = "lightblue", col = "black"
  )
aesthetics inside ggplot() are set for all
geoms
aesthetics inside geom_..() functions are set only
for this geom
# x and y are set in ggplot() and shared by both geoms;
# fill is set in geom_boxplot() only, so the jittered points are not filled
# by species.
iris_df %>%
  ggplot(aes(x = Species, y = Sepal.Length)) +
  geom_boxplot(aes(fill = Species)) +
  geom_jitter(width = 0.1)
As you have seen,
fill, color, size, shape, stroke and alpha can also be used outside of aes(). There they are set to a specific fixed value.
# The "aes()" labels were fused onto the first line of each statement, which
# made the code unparseable; they are now comments on their own lines.

# Inside aes(): color is mapped to a data variable (Species).
iris_df %>%
  ggplot(aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point(aes(color = Species)) # <--- here

# Outside aes(): color is a fixed value applied to every point.
ggplot(
  data = iris_df,
  mapping = aes(x = Sepal.Length, y = Sepal.Width)
) +
  geom_point(color = "blue") # <--- here
# Base scatter plot, stored in `p` so that later examples can add to it.
p <- ggplot(
  data = iris_df,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, color = Species)
) +
  geom_point()
p
# The same plot with three of the built-in complete themes.
p + theme_bw()
p + theme_classic()
p + theme_void()
# Adjust individual theme elements and keep the result in `p`.
p <- p +
  theme(
    axis.text = element_text(size = 15),
    axis.title = element_text(size = 20),
    panel.background = element_rect(fill = "white", color = "black")
  )
p
# Set axis labels, title, subtitle, caption and tag in a single labs() call.
p +
  labs(
    x = "Sepal length",
    y = "Sepal width",
    title = "Scatter plot",
    subtitle = "Subtitle",
    caption = "It is the great plot",
    tag = "A"
  )
Simple functions for labeling axes: + xlab() and
+ ylab()
# Scatter plot: petal width against petal length.
iris_df %>%
  ggplot(aes(x = Petal.Length, y = Petal.Width)) +
  geom_point() +
  theme_classic()
# Histogram of petal length; no bin count is given, so ggplot2 uses its default.
iris_df %>%
  ggplot(aes(x = Petal.Length)) +
  geom_histogram(color = "blue", fill = "lightblue") +
  theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Density of petal width, one semi-transparent curve per species.
iris_df %>%
  ggplot(aes(x = Petal.Width, fill = Species)) +
  geom_density(alpha = 0.8) +
  theme_classic()
# Boxplots of petal width, one per species (species on the y-axis).
iris_df %>%
  ggplot(aes(x = Petal.Width, y = Species, fill = Species)) +
  geom_boxplot() +
  theme_classic()
# Mean and standard deviation of sepal length for each species.
sepal_len_stats <- iris_df %>%
  group_by(Species) %>%
  summarise(
    mean_sepal_len = mean(Sepal.Length),
    sd_sepal_length = sd(Sepal.Length)
  )
sepal_len_stats
## # A tibble: 3 × 3
## Species mean_sepal_len sd_sepal_length
## <fct> <dbl> <dbl>
## 1 setosa 5.01 0.352
## 2 versicolor 5.94 0.516
## 3 virginica 6.59 0.636
# geom_col - if heights of bars are known (continuous data)
# (This label was fused onto the assignment line, which made it unparseable.)
# The bar heights come from the precomputed mean_sepal_len column.
p_col <- sepal_len_stats %>%
  ggplot(aes(x = Species, y = mean_sepal_len, fill = Species)) +
  geom_col() +
  theme_classic()
p_col
# geom_errorbar
# (This label was fused onto the code as `geom_errorbarp_col`, an undefined name.)
# Each error bar spans the mean minus/plus one standard deviation.
p_col +
  geom_errorbar(
    aes(
      ymin = mean_sepal_len - sd_sepal_length,
      ymax = mean_sepal_len + sd_sepal_length
    ),
    width = 0.3
  )
# The labels below were fused onto the first line of each statement, which
# made the code unparseable; they are now comments on their own lines.

# geom_bar - if heights are unknown (counting categorical data)
iris_df %>%
  ggplot(aes(x = Sepal.Length.Category, fill = Sepal.Length.Category)) +
  geom_bar()

# position = "stack" (by default)
iris_df %>%
  ggplot(aes(fill = Species, x = Sepal.Length.Category)) +
  geom_bar()

# position = "fill" - scaling from 0 to 1
iris_df %>%
  ggplot(aes(fill = Species, x = Sepal.Length.Category)) +
  geom_bar(position = "fill")

# position = "dodge"
iris_df %>%
  ggplot(aes(fill = Species, x = Sepal.Length.Category)) +
  geom_bar(position = "dodge")
# install.packages('ggpubr')
library(ggpubr)
# Bar plot of sepal width per species, built with ggpubr.
iris_df %>%
  ggbarplot(
    x = "Species",
    y = "Sepal.Width",
    fill = "Species",
    add = "mean_sd" # calculate mean and sd
  )
# Boxplot of sepal length per species, outlined by species.
# `color` is spelled out in full; the original `col` only worked through
# partial argument matching.
p_pubr <- iris_df %>%
  ggboxplot(x = "Species", y = "Sepal.Length", color = "Species")
# Every pair of species to compare.
comparisons <- list(
  c("setosa", "versicolor"),
  c("setosa", "virginica"),
  c("virginica", "versicolor")
)
# First layer: pairwise comparisons, labelled with significance symbols.
# Second layer: a further comparison label placed at y = 10.
p_pubr +
  stat_compare_means(comparisons = comparisons, label = "p.signif") +
  stat_compare_means(label.y = 10)
# NOTE(review): the three bare calls below are made without arguments and are
# not added to any plot; they look like section placeholders rather than
# finished examples - confirm intent.
scale_color_manual()
scale_fill_manual()
facet_grid()
# Adds facet_grid() to the plot `p` without any rows/cols specification.
# NOTE(review): presumably a faceting variable (e.g. Species) was meant - confirm.
p + facet_grid()